Data Collection¶
In [ ]:
import pandas as pd
In [ ]:
# Load the training split; the test split load is kept for later use.
train_df = pd.read_csv('train.csv')
# test_df = pd.read_csv('test.csv')
In [ ]:
train_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2000 entries, 0 to 1999 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 battery_power 2000 non-null int64 1 blue 2000 non-null int64 2 clock_speed 2000 non-null float64 3 dual_sim 2000 non-null int64 4 fc 2000 non-null int64 5 four_g 2000 non-null int64 6 int_memory 2000 non-null int64 7 m_dep 2000 non-null float64 8 mobile_wt 2000 non-null int64 9 n_cores 2000 non-null int64 10 pc 2000 non-null int64 11 px_height 2000 non-null int64 12 px_width 2000 non-null int64 13 ram 2000 non-null int64 14 sc_h 2000 non-null int64 15 sc_w 2000 non-null int64 16 talk_time 2000 non-null int64 17 three_g 2000 non-null int64 18 touch_screen 2000 non-null int64 19 wifi 2000 non-null int64 20 price_range 2000 non-null int64 dtypes: float64(2), int64(19) memory usage: 328.3 KB
In [ ]:
train_df.describe()
Out[ ]:
| battery_power | blue | clock_speed | dual_sim | fc | four_g | int_memory | m_dep | mobile_wt | n_cores | ... | px_height | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi | price_range | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2000.000000 | 2000.0000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | ... | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 |
| mean | 1238.518500 | 0.4950 | 1.522250 | 0.509500 | 4.309500 | 0.521500 | 32.046500 | 0.501750 | 140.249000 | 4.520500 | ... | 645.108000 | 1251.515500 | 2124.213000 | 12.306500 | 5.767000 | 11.011000 | 0.761500 | 0.503000 | 0.507000 | 1.500000 |
| std | 439.418206 | 0.5001 | 0.816004 | 0.500035 | 4.341444 | 0.499662 | 18.145715 | 0.288416 | 35.399655 | 2.287837 | ... | 443.780811 | 432.199447 | 1084.732044 | 4.213245 | 4.356398 | 5.463955 | 0.426273 | 0.500116 | 0.500076 | 1.118314 |
| min | 501.000000 | 0.0000 | 0.500000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.100000 | 80.000000 | 1.000000 | ... | 0.000000 | 500.000000 | 256.000000 | 5.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 851.750000 | 0.0000 | 0.700000 | 0.000000 | 1.000000 | 0.000000 | 16.000000 | 0.200000 | 109.000000 | 3.000000 | ... | 282.750000 | 874.750000 | 1207.500000 | 9.000000 | 2.000000 | 6.000000 | 1.000000 | 0.000000 | 0.000000 | 0.750000 |
| 50% | 1226.000000 | 0.0000 | 1.500000 | 1.000000 | 3.000000 | 1.000000 | 32.000000 | 0.500000 | 141.000000 | 4.000000 | ... | 564.000000 | 1247.000000 | 2146.500000 | 12.000000 | 5.000000 | 11.000000 | 1.000000 | 1.000000 | 1.000000 | 1.500000 |
| 75% | 1615.250000 | 1.0000 | 2.200000 | 1.000000 | 7.000000 | 1.000000 | 48.000000 | 0.800000 | 170.000000 | 7.000000 | ... | 947.250000 | 1633.000000 | 3064.500000 | 16.000000 | 9.000000 | 16.000000 | 1.000000 | 1.000000 | 1.000000 | 2.250000 |
| max | 1998.000000 | 1.0000 | 3.000000 | 1.000000 | 19.000000 | 1.000000 | 64.000000 | 1.000000 | 200.000000 | 8.000000 | ... | 1960.000000 | 1998.000000 | 3998.000000 | 19.000000 | 18.000000 | 20.000000 | 1.000000 | 1.000000 | 1.000000 | 3.000000 |
8 rows × 21 columns
In [ ]:
train_df.hist(figsize=(20,20),color="blue")
Out[ ]:
array([[<Axes: title={'center': 'battery_power'}>,
<Axes: title={'center': 'blue'}>,
<Axes: title={'center': 'clock_speed'}>,
<Axes: title={'center': 'dual_sim'}>,
<Axes: title={'center': 'fc'}>],
[<Axes: title={'center': 'four_g'}>,
<Axes: title={'center': 'int_memory'}>,
<Axes: title={'center': 'm_dep'}>,
<Axes: title={'center': 'mobile_wt'}>,
<Axes: title={'center': 'n_cores'}>],
[<Axes: title={'center': 'pc'}>,
<Axes: title={'center': 'px_height'}>,
<Axes: title={'center': 'px_width'}>,
<Axes: title={'center': 'ram'}>,
<Axes: title={'center': 'sc_h'}>],
[<Axes: title={'center': 'sc_w'}>,
<Axes: title={'center': 'talk_time'}>,
<Axes: title={'center': 'three_g'}>,
<Axes: title={'center': 'touch_screen'}>,
<Axes: title={'center': 'wifi'}>],
[<Axes: title={'center': 'price_range'}>, <Axes: >, <Axes: >,
<Axes: >, <Axes: >]], dtype=object)
Data Preprocessing¶
Data Cleaning¶
In [ ]:
# Count missing values per column; the output below shows zero for every column.
train_df.isnull().sum()
# test_df.isnull().sum()
Out[ ]:
battery_power 0 blue 0 clock_speed 0 dual_sim 0 fc 0 four_g 0 int_memory 0 m_dep 0 mobile_wt 0 n_cores 0 pc 0 px_height 0 px_width 0 ram 0 sc_h 0 sc_w 0 talk_time 0 three_g 0 touch_screen 0 wifi 0 price_range 0 dtype: int64
Feature Engineering¶
In [ ]:
# Screen Area
# Derived feature: screen area = screen height x screen width.
screen_area = train_df['sc_h'] * train_df['sc_w']
train_df['sc_a'] = screen_area
# test_df['sc_a'] = test_df['sc_h']*test_df['sc_w']
Data Transformation¶
In [ ]:
# Checking each feature whether it is categorical or numerical feature
# Print the value distribution of every column to judge, by cardinality,
# which features are categorical and which are numerical.
for feature in train_df:
    print(train_df[feature].value_counts())
battery_power
1872 6
618 6
1589 6
1715 5
1807 5
..
660 1
1452 1
1005 1
1372 1
858 1
Name: count, Length: 1094, dtype: int64
blue
0 1010
1 990
Name: count, dtype: int64
clock_speed
0.5 413
2.8 85
2.3 78
2.1 76
1.6 76
2.5 74
0.6 74
1.4 70
1.3 68
1.5 67
2.0 67
1.9 65
0.7 64
2.9 62
1.8 62
1.0 61
1.7 60
2.2 59
0.9 58
2.4 58
0.8 58
1.2 56
2.6 55
2.7 55
1.1 51
3.0 28
Name: count, dtype: int64
dual_sim
1 1019
0 981
Name: count, dtype: int64
fc
0 474
1 245
2 189
3 170
5 139
4 133
6 112
7 100
9 78
8 77
10 62
11 51
12 45
13 40
16 24
15 23
14 20
18 11
17 6
19 1
Name: count, dtype: int64
four_g
1 1043
0 957
Name: count, dtype: int64
int_memory
27 47
16 45
14 45
57 42
2 42
..
22 24
38 23
62 21
4 20
59 18
Name: count, Length: 63, dtype: int64
m_dep
0.1 320
0.2 213
0.8 208
0.5 205
0.7 200
0.3 199
0.9 195
0.6 186
0.4 168
1.0 106
Name: count, dtype: int64
mobile_wt
182 28
101 27
185 27
146 26
199 26
..
116 10
140 9
120 9
149 9
96 9
Name: count, Length: 121, dtype: int64
n_cores
4 274
7 259
8 256
2 247
3 246
5 246
1 242
6 230
Name: count, dtype: int64
pc
10 122
7 119
9 112
20 110
1 104
14 104
0 101
2 99
17 99
6 95
4 95
3 93
15 92
12 90
8 89
16 88
13 85
19 83
18 82
11 79
5 59
Name: count, dtype: int64
px_height
347 7
179 6
371 6
275 6
674 5
..
87 1
648 1
341 1
993 1
483 1
Name: count, Length: 1137, dtype: int64
px_width
874 7
1247 7
1383 6
1463 6
1469 6
..
1125 1
1367 1
1569 1
1481 1
1632 1
Name: count, Length: 1109, dtype: int64
ram
1464 4
3142 4
2610 4
2227 4
1229 4
..
2312 1
2167 1
3508 1
297 1
3919 1
Name: count, Length: 1562, dtype: int64
sc_h
17 193
12 157
7 151
16 143
14 143
15 135
13 131
11 126
10 125
9 124
19 124
18 120
8 117
6 114
5 97
Name: count, dtype: int64
sc_w
1 210
3 199
4 182
0 180
5 161
2 156
7 132
6 130
8 125
10 107
9 97
11 84
12 68
13 49
14 33
15 31
16 29
17 19
18 8
Name: count, dtype: int64
talk_time
7 124
4 123
16 116
15 115
19 113
6 111
10 105
8 104
11 103
20 102
14 101
13 100
18 100
9 100
2 99
12 99
17 98
3 94
5 93
Name: count, dtype: int64
three_g
1 1523
0 477
Name: count, dtype: int64
touch_screen
1 1006
0 994
Name: count, dtype: int64
wifi
1 1014
0 986
Name: count, dtype: int64
price_range
1 500
2 500
3 500
0 500
Name: count, dtype: int64
sc_a
0 180
24 44
30 39
10 37
32 34
...
64 4
162 4
26 3
95 2
133 2
Name: count, Length: 127, dtype: int64
From the data inspection above, we can see that the features fall into two types: categorical and numerical.
| Categorical (7+1) | Numerical(14) |
|---|---|
| blue | battery_power |
| dual_sim | clock_speed |
| four_g | fc |
| three_g | int_memory |
| touch_screen | m_dep, mobile_wt |
| wifi | pc |
| n_cores | px_height,px_width |
| price_range | ram |
| sc_h, sc_w, sc_a | |
| talk_time |
Handling numerical data & categorical data¶
In [ ]:
# Column groups chosen from the value_counts() inspection above.
# Order matters: it fixes the subplot layout in the plots below.
categorical_features = [
    'price_range', 'blue', 'dual_sim', 'four_g',
    'three_g', 'touch_screen', 'wifi', 'n_cores',
]
numerical_features = [
    'battery_power', 'clock_speed', 'fc', 'int_memory', 'm_dep',
    'mobile_wt', 'pc', 'px_height', 'px_width', 'ram',
    'sc_h', 'sc_w', 'sc_a', 'talk_time',
]
In [ ]:
# Cast each discrete-valued column to pandas' category dtype.
for feature in categorical_features:
    train_df[feature] = train_df[feature].astype('category')
Feature scaling¶
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
In [ ]:
# One histogram per numerical feature to eyeball each distribution
# before deciding on a scaling strategy.
fig, axes = plt.subplots(1, len(numerical_features), figsize=(30, 3), layout='constrained')
fig.suptitle('Checking distribution', fontsize=14, fontweight='bold')
for axis, feature in zip(axes, numerical_features):
    axis.hist(train_df[feature])
    axis.set_title(feature)
    axis.tick_params(axis='x', labelrotation=45)
    axis.tick_params(axis='y', labelsize=6)
plt.show()
We will normalize the numerical features, since the histograms show their distributions are not normal.
EDA¶
Univariate analysis¶
In [ ]:
# Univariate view: histogram of each numerical feature.
fig, axes = plt.subplots(1, len(numerical_features), figsize=(30, 3), layout='constrained')
fig.suptitle('Distribution', fontsize=14, fontweight='bold')
for axis, feature in zip(axes, numerical_features):
    axis.hist(train_df[feature])
    axis.set_title(feature)
    axis.tick_params(axis='x', labelrotation=45)
    axis.tick_params(axis='y', labelsize=6)
plt.show()
In [ ]:
# Univariate view: bar chart of class counts for each categorical feature.
fig, axs = plt.subplots(1, len(categorical_features), figsize=(30, 10), layout='constrained')
fig.suptitle('Distribution', fontsize=14, fontweight='bold')
for i, col in enumerate(categorical_features):
    # Compute the counts once per column instead of twice.
    counts = train_df[col].value_counts()
    axs[i].bar(counts.index, counts)
    axs[i].set_title(col)
    axs[i].tick_params(axis='x', labelrotation=45)
    axs[i].tick_params(axis='y', labelsize=8)
plt.show()
Bivariate Analysis¶
In [ ]:
import seaborn as sns
In [ ]:
# One box plot per numerical feature, split by price_range, sharing a single
# figure-level legend instead of one legend per subplot.
fig, axs = plt.subplots(1, len(numerical_features), figsize=(30, 10), constrained_layout=True)
fig.suptitle('Relationship between numerical features and the target variable', fontsize=14, fontweight='bold')
# Initialize an empty handles and labels list to collect legend information
handles, labels = [], []
for i, col in enumerate(numerical_features):
    ax = sns.boxplot(x='price_range', y=col, hue='price_range', data=train_df, ax=axs[i], palette='pastel')
    axs[i].set_title(col)
    axs[i].tick_params(axis='x', labelrotation=45)
    axs[i].tick_params(axis='y', labelsize=8)
    # Remove legend from individual subplot
    # NOTE(review): newer seaborn versions may return None from get_legend()
    # when hue equals x — confirm against the installed seaborn version.
    ax.get_legend().remove()
    # Get handles and labels for the legend (only from the first subplot,
    # since every subplot shares the same hue groups)
    if i == 0:
        h, l = ax.get_legend_handles_labels()
        handles.extend(h)
        labels.extend(l)
# Create a single legend outside the subplots, one column per price class
fig.legend(handles, labels, loc='upper left', ncol=len(train_df['price_range'].unique()), frameon=False)
plt.show()
In [ ]:
# Grouped count plots: class balance of each categorical feature per price tier.
fig, axes = plt.subplots(1, len(categorical_features), figsize=(30, 5), layout='constrained')
fig.suptitle('relationship between categorical features and the target variable', fontsize=14, fontweight='bold')
for axis, feature in zip(axes, categorical_features):
    sns.countplot(x=feature, hue='price_range', data=train_df, ax=axis)
    axis.set_title(f'{feature} vs price_range')
    axis.tick_params(axis='x')
    axis.legend(loc='lower right', title='price range')
plt.show()
Correlation Analysis¶
In [ ]:
# Numerical features plus the target column, for the correlation matrix below.
numerical_features_target = numerical_features + ['price_range']
In [ ]:
# Pearson correlation among the numerical features and the target.
correlation_matrix = train_df[numerical_features_target].corr()
# Each feature's correlation with the target, dropping the trivial
# price_range-with-itself entry.
correlation_with_target = correlation_matrix['price_range'].drop('price_range')
plt.figure(figsize=(20, 16))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix between Numerical Features and Target Variable', fontweight='bold')
plt.show()
print("Correlation with target variable (price_range):")
print(correlation_with_target)
Correlation with target variable (price_range): battery_power 0.200723 clock_speed -0.006606 fc 0.021998 int_memory 0.044435 m_dep 0.000853 mobile_wt -0.030302 pc 0.033599 px_height 0.148858 px_width 0.165818 ram 0.917046 sc_h 0.022986 sc_w 0.038711 sc_a 0.041248 talk_time 0.021859 Name: price_range, dtype: float64
More Visualization¶
In [ ]:
# Pairwise scatter matrix of the numerical features, colored by price tier.
sns.pairplot(train_df, vars=numerical_features, hue='price_range')
plt.show()